import warnings
warnings.filterwarnings('ignore')
!pip install -q openpyxl fasttext nlpaug xlrd wordcloud
!pip install -U -q PyDrive
import time
import glob
import numpy as np
import pandas as pd
import tensorflow as tf
import os
import random
import sklearn
from sklearn.utils import class_weight
from sklearn import model_selection, preprocessing, linear_model, naive_bayes, metrics, svm
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn import decomposition, ensemble
from sklearn.metrics import accuracy_score
from sklearn import tree
from sklearn.metrics import confusion_matrix
from sklearn.metrics import cohen_kappa_score
from sklearn.metrics import precision_score
from sklearn.model_selection import cross_validate
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import matthews_corrcoef
from sklearn.metrics import roc_auc_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import balanced_accuracy_score, recall_score, f1_score
from sklearn.metrics import make_scorer
from keras.preprocessing.text import Tokenizer
from keras.preprocessing import sequence
from keras.utils import np_utils
from xgboost import XGBClassifier
import string
from tqdm import tqdm
tqdm.pandas()
import re
import seaborn as sns
import matplotlib.pyplot as plt
import nltk
nltk.download('words')
import collections
import matplotlib.cm as cm
from matplotlib import rcParams
from nltk.corpus import stopwords
nltk.download('stopwords')
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')
from nltk.corpus import wordnet
nltk.download('averaged_perceptron_tagger')
from collections import defaultdict
import matplotlib.ticker as ticker
import matplotlib as mpl
from matplotlib.gridspec import GridSpec
from nltk import sent_tokenize
import openpyxl
from wordcloud import WordCloud, STOPWORDS
from sklearn.decomposition import PCA
import fasttext
import fasttext.util
df = pd.read_excel('input.xlsx', index_col=0)
df.reset_index(inplace = True)
df
df_processing = df.copy()
df_processing['text'] = df['Short description']+" "+df["Description"]
df_processing = df_processing[['text','Assignment group']]
df_processing['Assignment group']= df_processing['Assignment group'].str.replace('GRP','')
df_processing['Assignment group']= df_processing['Assignment group'].str.replace('_','')
df_processing['Assignment group']= df_processing['Assignment group'].astype(int)
df_processing['text']= df_processing['text'].astype(str)
df_processing['Assignment group'].value_counts()
top_20 = df_processing['Assignment group'].value_counts().nlargest(20).reset_index()
plt.subplots(figsize=(20,5))
sns.barplot(x='index', y='Assignment group', data=top_20)
plt.xlabel('Assignment Group')
plt.ylabel('Count')
plt.xticks(rotation=90)
plt.title('Assignment Group Distribution')
plt.ylim(0, 5000)
plt.show();
plt.figure(1,figsize=(16,15))
df_processing['Assignment group'].value_counts().sort_values().plot(kind = 'barh')
Group 0 alone accounts for about 46% of the records; only a handful of groups exceed 1%, and the remaining groups each hold less than 1% of the records.
value_count_df = df_processing['Assignment group'].value_counts().to_frame().reset_index()
value_count_df['Per'] = df_processing['Assignment group'].value_counts(normalize=True).values  # .values keeps positional alignment after reset_index
GrpToBeMaintained = value_count_df[value_count_df['Assignment group'] >= 80]['index'].values
value_count_df[value_count_df['Assignment group'] >= 80]
GrpToBeMaintained
title_rating = df_processing.groupby('Assignment group').agg('count')
rating_labels = title_rating.text.sort_values().index
rating_counts = title_rating.text.sort_values()
plt.figure(1, figsize=(40,70))
the_grid = GridSpec(2, 2)
cmap = plt.get_cmap('Spectral')
colors = [cmap(i) for i in np.linspace(0, 1, 8)]
plt.subplot(the_grid[0, 1], aspect=1, title='Percentage of Each Group')
type_show_ids = plt.pie(rating_counts, labels=rating_labels, autopct='%1.1f%%', shadow=True, colors=colors)
plt.show()
def assignGroup(x):
    if x in GrpToBeMaintained:
        val = x
    else:
        val = 100
    return val
df_sampled = df_processing[df_processing['Assignment group'].isin(GrpToBeMaintained)].copy()  # .copy() so the cleaning below writes to a real frame, not a view
df_sampled.isnull().sum()
df_sampled
def wl(text):
    return len(text.split(" "))
fig_df = pd.DataFrame()
fig_df['word_length']=df_sampled['text'].apply(wl)
fig_df['char_length']=df_sampled['text'].apply(len)
fig_df['text']= df_sampled['text']
fig_df['Assignment group']= df_sampled['Assignment group']
fig_df[["text","word_length"]].sort_values(by = "word_length",ascending = False).head(10)
fig_df[["text","word_length"]].sort_values(by = "word_length",ascending = True).head(10)
fig_df[["text","char_length"]].sort_values(by = "char_length",ascending = False).head()
fig_df[["text","char_length"]].sort_values(by = "char_length",ascending = True).head()
df_nan = fig_df.loc[fig_df['word_length'] == 1]
df_nan
df_sampled = df_sampled.drop(df_nan.index)  # assign back, otherwise the drop has no effect
fig_df['word_length'].hist()
fig_df['char_length'].hist()
fig_df.groupby('Assignment group').agg({'word_length': ['mean', 'min', 'max']})
fig_df.groupby('Assignment group').agg({'char_length': ['mean', 'min', 'max']})
sns.scatterplot(x='char_length',y='word_length',data=fig_df)
stopwords = set(STOPWORDS)
def show_wordcloud(data, title=None):
    wordcloud = WordCloud(
        background_color='black',
        stopwords=stopwords,
        max_words=200,
        max_font_size=40,
        scale=3,
        random_state=1
    ).generate(str(data))
    fig = plt.figure(1, figsize=(15, 15))
    plt.axis('off')
    if title:
        fig.suptitle(title, fontsize=20)
        fig.subplots_adjust(top=2.3)
    plt.imshow(wordcloud)
    plt.show()
show_wordcloud(df_processing['text'])
STOPWORDS = set(STOPWORDS)
words = set(nltk.corpus.words.words())
PUNCT_TO_REMOVE = string.punctuation
lemmatizer = WordNetLemmatizer()
wordnet_map = {"N":wordnet.NOUN, "V":wordnet.VERB, "J":wordnet.ADJ, "R":wordnet.ADV}
Preprocessing steps:
- Normalise fully upper-case words (title-case them)
- Remove URLs
- Remove HTML tags
- Remove emojis
- Remove special characters
- Remove punctuation
- Remove meaningless (non-dictionary) words
- Remove stopwords
- Lemmatise
# Functions for preprocessing
def remove_upper_case(text):
    '''
    Transform fully upper-case words into title-case words.
    @param text: (str) text
    @return: (str) text without upper-case words
    '''
    sentences = text.split("\n")
    new_sentences = []
    for i in sentences:
        words = i.split()  # split the current sentence, not the whole text
        stripped = [w.title() if w.isupper() else w for w in words]
        new_sentences.append(" ".join(stripped))
    return "\n".join(new_sentences)

def remove_URL(text):
    '''
    Remove URLs from text.
    @param text: (str) sentence
    @return: (str) clean text
    '''
    url = re.compile(r'https?://\S+|www\.\S+')
    return url.sub(r'', text)

def remove_html(text):
    '''
    Remove HTML tags from text.
    @param text: (str) sentence
    @return: (str) clean text
    '''
    html = re.compile(r'<.*?>')
    return html.sub(r'', text)

def remove_emoji(text):
    '''
    Remove emojis, symbols, pictograms etc. from text.
    @param text: (str) sentence
    @return: (str) clean text
    '''
    emoji_pattern = re.compile("["
                               u"\U0001F600-\U0001F64F"  # emoticons
                               u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                               u"\U0001F680-\U0001F6FF"  # transport & map symbols
                               u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                               u"\U00002702-\U000027B0"
                               u"\U000024C2-\U0001F251"
                               "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', text)

def remove_special_char(text):
    '''
    Remove special characters from text.
    @param text: (str) sentence
    @return: (str) clean text
    '''
    # a plain character class (no negation, no anchor), so the characters themselves are removed
    spcl_char_pattern = re.compile(r'[`~!@#$%^&*()_+={}\[\]|\\:;“’<,>.?๐฿]')
    return spcl_char_pattern.sub(r'', text)

def clean_sent_org(sent):
    # keep dictionary words and non-alphabetic tokens
    return " ".join(w for w in nltk.wordpunct_tokenize(sent)
                    if w.lower() in words or not w.isalpha())

def clean_sent(sent):
    # keep dictionary words only
    return " ".join(w for w in nltk.wordpunct_tokenize(sent)
                    if w.lower() in words)

def remove_punctuation(text):
    """custom function to remove the punctuation"""
    return text.translate(str.maketrans('', '', PUNCT_TO_REMOVE))

def remove_stopwords(text):
    """custom function to remove the stopwords"""
    return " ".join([word for word in str(text).split() if word not in STOPWORDS])

def lemmatize_words(text):
    pos_tagged_text = nltk.pos_tag(text.split())
    return " ".join([lemmatizer.lemmatize(word, wordnet_map.get(pos[0], wordnet.NOUN))
                     for word, pos in pos_tagged_text])
#df["text_lemmatized"] = df["text"].apply(lambda text: lemmatize_words(text))
TEXT = 'text'
df_sampled[TEXT] = df_sampled[TEXT].apply(remove_upper_case)
df_sampled[TEXT] = df_sampled[TEXT].apply(remove_URL)
df_sampled[TEXT] = df_sampled[TEXT].apply(remove_html)
df_sampled[TEXT] = df_sampled[TEXT].apply(remove_emoji)
df_sampled[TEXT] = df_sampled[TEXT].apply(remove_special_char)
df_sampled[TEXT] = df_sampled[TEXT].apply(clean_sent)
df_sampled[TEXT] = df_sampled[TEXT].apply(remove_punctuation)
df_sampled[TEXT] = df_sampled[TEXT].apply(remove_stopwords)
df_sampled[TEXT] = df_sampled[TEXT].apply(lemmatize_words)
df_sampled[TEXT] = df_sampled[TEXT].str.strip().replace('', 'NaN')
df_sampled[df_sampled[TEXT]=='NaN']
df_sampled = df_sampled[df_sampled[TEXT] != 'NaN']
df_sampled
top_20_grp = df_sampled['Assignment group'].value_counts().nlargest(20).index
for i in top_20_grp.values:
    print("Word cloud for class ", i)
    show_wordcloud(df_sampled[df_sampled['Assignment group'] == i]['text'])
def plt_freq_words(all_lines):
    filtered_words = [word for word in all_lines.split()]
    counted_words = collections.Counter(filtered_words)
    words = []
    counts = []
    for word, count in counted_words.most_common(20):
        words.append(word)
        counts.append(count)
    colors = cm.rainbow(np.linspace(0, 1, 20))
    rcParams['figure.figsize'] = 20, 10
    plt.title('Top words in the tickets vs their count')
    plt.xlabel('Count')
    plt.ylabel('Words')
    plt.barh(words, counts, color=colors)
plt_freq_words( ' '.join(df_sampled['text'].str.lower()))
plt_freq_words(' '.join(df_sampled[df_sampled['Assignment group'] == 0]['text'].str.lower()))
plt_freq_words(' '.join(df_sampled[df_sampled['Assignment group'] == 8]['text'].str.lower()))
plt_freq_words(' '.join(df_sampled[df_sampled['Assignment group'] == 24]['text'].str.lower()))
plt_freq_words(' '.join(df_sampled[df_sampled['Assignment group'] == 12]['text'].str.lower()))
def generate_N_grams(text, ngram):
    words = [word for word in text.split(" ") if word not in STOPWORDS]
    temp = zip(*[words[i:] for i in range(0, ngram)])
    ans = [' '.join(ngram) for ngram in temp]
    return ans
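A quick check on a short hypothetical sentence confirms that stopwords are dropped before the n-grams are formed:
# unigrams vs bigrams for a made-up sentence
print(generate_N_grams("printer queue stuck again", 1))
print(generate_N_grams("printer queue stuck again", 2))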
Grp0Values=defaultdict(int)
Grp8Values=defaultdict(int)
Grp24Values=defaultdict(int)
Grp12Values=defaultdict(int)
# count every unigram per assignment group
for text in df_sampled[df_sampled['Assignment group'] == 0].text:
    for word in generate_N_grams(text, 1):
        Grp0Values[word] += 1
for text in df_sampled[df_sampled['Assignment group'] == 8].text:
    for word in generate_N_grams(text, 1):
        Grp8Values[word] += 1
for text in df_sampled[df_sampled['Assignment group'] == 24].text:
    for word in generate_N_grams(text, 1):
        Grp24Values[word] += 1
for text in df_sampled[df_sampled['Assignment group'] == 12].text:
    for word in generate_N_grams(text, 1):
        Grp12Values[word] += 1
df_0Values=pd.DataFrame(sorted(Grp0Values.items(),key=lambda x:x[1],reverse=True))
df_8Values=pd.DataFrame(sorted(Grp8Values.items(),key=lambda x:x[1],reverse=True))
df_24Values=pd.DataFrame(sorted(Grp24Values.items(),key=lambda x:x[1],reverse=True))
df_12Values=pd.DataFrame(sorted(Grp12Values.items(),key=lambda x:x[1],reverse=True))
plt.figure(1, figsize=(16, 4))
plt.bar(df_0Values[0][:10], df_0Values[1][:10], color='green', width=0.4)
plt.xlabel("Unigrams in Group 0")
plt.ylabel("Count")
plt.title("Top 10 words in Group 0")
plt.savefig("unigram_0.png")
plt.show()
Grp0Values_2=defaultdict(int)
Grp8Values_2=defaultdict(int)
Grp24Values_2=defaultdict(int)
Grp12Values_2=defaultdict(int)
# count every bigram per assignment group
for text in df_sampled[df_sampled['Assignment group'] == 0].text:
    for word in generate_N_grams(text, 2):
        Grp0Values_2[word] += 1
for text in df_sampled[df_sampled['Assignment group'] == 8].text:
    for word in generate_N_grams(text, 2):
        Grp8Values_2[word] += 1
for text in df_sampled[df_sampled['Assignment group'] == 24].text:
    for word in generate_N_grams(text, 2):
        Grp24Values_2[word] += 1
for text in df_sampled[df_sampled['Assignment group'] == 12].text:
    for word in generate_N_grams(text, 2):
        Grp12Values_2[word] += 1
df_0Values_2=pd.DataFrame(sorted(Grp0Values_2.items(),key=lambda x:x[1],reverse=True))
df_8Values_2=pd.DataFrame(sorted(Grp8Values_2.items(),key=lambda x:x[1],reverse=True))
df_24Values_2=pd.DataFrame(sorted(Grp24Values_2.items(),key=lambda x:x[1],reverse=True))
df_12Values_2=pd.DataFrame(sorted(Grp12Values_2.items(),key=lambda x:x[1],reverse=True))
plt.figure(1, figsize=(16, 4))
plt.bar(df_8Values_2[0][:10], df_8Values_2[1][:10], color='green', width=0.4)
plt.xlabel("Bigrams in Group 8")
plt.ylabel("Count")
plt.title("Top 10 bigrams in Group 8")
plt.savefig("bigram_8.png")
plt.show()
plt.figure(1, figsize=(16, 4))
plt.bar(df_24Values_2[0][:10], df_24Values_2[1][:10], color='green', width=0.4)
plt.xlabel("Bigrams in Group 24")
plt.ylabel("Count")
plt.title("Top 10 bigrams in Group 24")
plt.savefig("bigram_24.png")
plt.show()
plt.figure(1, figsize=(16, 4))
plt.bar(df_12Values_2[0][:10], df_12Values_2[1][:10], color='green', width=0.4)
plt.xlabel("Bigrams in Group 12")
plt.ylabel("Count")
plt.title("Top 10 bigrams in Group 12")
plt.savefig("bigram_12.png")
plt.show()
Grp0Values_3=defaultdict(int)
Grp8Values_3=defaultdict(int)
Grp24Values_3=defaultdict(int)
Grp12Values_3=defaultdict(int)
# count every trigram per assignment group
for text in df_sampled[df_sampled['Assignment group'] == 0].text:
    for word in generate_N_grams(text, 3):
        Grp0Values_3[word] += 1
for text in df_sampled[df_sampled['Assignment group'] == 8].text:
    for word in generate_N_grams(text, 3):
        Grp8Values_3[word] += 1
for text in df_sampled[df_sampled['Assignment group'] == 24].text:
    for word in generate_N_grams(text, 3):
        Grp24Values_3[word] += 1
for text in df_sampled[df_sampled['Assignment group'] == 12].text:
    for word in generate_N_grams(text, 3):
        Grp12Values_3[word] += 1
df_0Values_3=pd.DataFrame(sorted(Grp0Values_3.items(),key=lambda x:x[1],reverse=True))
df_8Values_3=pd.DataFrame(sorted(Grp8Values_3.items(),key=lambda x:x[1],reverse=True))
df_24Values_3=pd.DataFrame(sorted(Grp24Values_3.items(),key=lambda x:x[1],reverse=True))
df_12Values_3=pd.DataFrame(sorted(Grp12Values_3.items(),key=lambda x:x[1],reverse=True))
plt.figure(1, figsize=(16, 4))
plt.bar(df_8Values_3[0][:10], df_8Values_3[1][:10], color='green', width=0.4)
plt.xticks(rotation=90)
plt.xlabel("Trigrams in Group 8")
plt.ylabel("Count")
plt.title("Top 10 trigrams in Group 8")
plt.savefig("trigram_8.png")
plt.show()
plt.figure(1, figsize=(16, 4))
plt.bar(df_24Values_3[0][:10], df_24Values_3[1][:10], color='green', width=0.4)
plt.xticks(rotation=90)
plt.xlabel("Trigrams in Group 24")
plt.ylabel("Count")
plt.title("Top 10 trigrams in Group 24")
plt.savefig("trigram_24.png")
plt.show()
plt.figure(1, figsize=(16, 4))
plt.bar(df_12Values_3[0][:10], df_12Values_3[1][:10], color='green', width=0.4)
plt.xticks(rotation=90)
plt.xlabel("Trigrams in Group 12")
plt.ylabel("Count")
plt.title("Top 10 trigrams in Group 12")
plt.savefig("trigram_12.png")
plt.show()
import nlpaug.augmenter.char as nac
import nlpaug.augmenter.word as naw
#aug = nac.KeyboardAug()
aug = naw.SynonymAug()
aug.augment("tomorrow is working day")
from random import shuffle
def augment_text(df, samples, Grp_No):
    ## select the minority-class samples
    df_n = df[df['Assignment group'] == Grp_No].reset_index(drop=True)
    ## data augmentation loop
    for i in tqdm(np.random.randint(0, len(df_n), samples)):
        text = df_n.iloc[i]['text']
        new_text = augment_one(text)  # wrapper defined above; handles str/list returns across nlpaug versions
        new = pd.DataFrame({'text': [new_text], 'Assignment group': [Grp_No]})
        df_n = df_n.append(new).reset_index(drop=True)
    ## return the augmented dataframe
    return df_n
df_grp_8 = augment_text(df_sampled,200,8)
df_grp_12 = augment_text(df_sampled,400,12)
df_grp_9 = augment_text(df_sampled,400,9)
df_grp_2 = augment_text(df_sampled,400,2)
df_grp_24 = augment_text(df_sampled,400,24)
df_grp_6 = augment_text(df_sampled,400,6)
df_grp_3 = augment_text(df_sampled,500,3)
df_grp_19 = augment_text(df_sampled,500,19)
df_grp_13 = augment_text(df_sampled,500,13)
df_grp_10 = augment_text(df_sampled,500,10)
df_grp_5 = augment_text(df_sampled,500,5)
df_grp_14 = augment_text(df_sampled,500,14)
df_grp_25 = augment_text(df_sampled,500,25)
df_grp_4 = augment_text(df_sampled,500,4)
df_grp_29 = augment_text(df_sampled,500,29)
df_grp_18 = augment_text(df_sampled,500,18)
df_grp_17 = augment_text(df_sampled,500,17)
df_grp_16 = augment_text(df_sampled,500,16)
df_grp_33 = augment_text(df_sampled,500,33)
df_grp_0 = df_sampled[df_sampled['Assignment group']==0].sample(1000)
df_model = pd.concat([df_grp_8, df_grp_12, df_grp_9,df_grp_2,df_grp_24,df_grp_6,df_grp_3 ,df_grp_19,df_grp_13,df_grp_10,df_grp_5,df_grp_14,df_grp_25,df_grp_4,df_grp_29,df_grp_18,df_grp_17,df_grp_16,df_grp_33,df_grp_0 ])
title_rating = df_model.groupby('Assignment group').agg('count')
rating_labels = title_rating.text.sort_values().index
rating_counts = title_rating.text.sort_values()
plt.figure(1, figsize=(40,70))
the_grid = GridSpec(2, 2)
cmap = plt.get_cmap('Spectral')
colors = [cmap(i) for i in np.linspace(0, 1, 8)]
plt.subplot(the_grid[0, 1], aspect=1, title='Percentage of Each Group')
type_show_ids = plt.pie(rating_counts, labels=rating_labels, autopct='%1.1f%%', shadow=True, colors=colors)
plt.show()
df_model
LABEL = 'Assignment group'
train_x, rem_x, train_y, rem_y = model_selection.train_test_split(df_model[TEXT], df_model[LABEL], random_state=42, stratify=df_model[LABEL], train_size=0.8)
valid_x, test_x, valid_y, test_y = model_selection.train_test_split(rem_x,rem_y, test_size=0.5,random_state=42)
train_x_org, rem_x_org, train_y_org, rem_y_org = model_selection.train_test_split(df_sampled[TEXT], df_sampled[LABEL], random_state=42, stratify=df_sampled[LABEL], train_size=0.8)
valid_x_org, test_x_org, valid_y_org, test_y_org = model_selection.train_test_split(rem_x_org, rem_y_org, random_state=42, train_size=0.5)
df_results = pd.DataFrame()
def apply_pca(X):
    cov_matrix_1 = np.cov(X.T)
    # eigh is appropriate for a symmetric covariance matrix and keeps the eigenvalues real
    eigen_value_1, eigen_vector_1 = np.linalg.eigh(cov_matrix_1)
    # cumulative explained variance
    tol_1 = sum(eigen_value_1)
    var_eigen_value = [(i / tol_1) * 100 for i in sorted(eigen_value_1, reverse=True)]
    cum_eigen_val = np.cumsum(var_eigen_value)
    plt.plot(cum_eigen_val)
    return eigen_value_1, var_eigen_value, cum_eigen_val
def apply_count_vect(xtrain, xvalid, xtest):
    count_vect = CountVectorizer(analyzer='word', token_pattern=r'\w{1,}', max_features=300)
    count_vect.fit(xtrain)
    # transform the training, validation and test data using the count vectorizer object
    xtraincount = count_vect.transform(xtrain).toarray()
    xvalidcount = count_vect.transform(xvalid).toarray()
    xtestcount = count_vect.transform(xtest).toarray()
    return xtraincount, xvalidcount, xtestcount
def apply_tf_idf_vect(xtrain, xvalid, xtest):
    # word-level tf-idf with uni- to tri-grams
    tfidf_vect = TfidfVectorizer(ngram_range=(1, 3), max_features=300)
    tfidf_vect.fit(xtrain)
    tfidf_vect_train = tfidf_vect.transform(xtrain)
    tfidf_vect_val = tfidf_vect.transform(xvalid)
    tfidf_vect_test = tfidf_vect.transform(xtest)
    return tfidf_vect_train.toarray(), tfidf_vect_val.toarray(), tfidf_vect_test.toarray()
def fit_pca(n_component, x_train, x_test, x_val):
    pca_1 = PCA(n_components=n_component, random_state=1)
    x_train_pca = pca_1.fit_transform(x_train)  # fit once on train, then project the rest
    x_test_pca = pca_1.transform(x_test)
    x_valid_pca = pca_1.transform(x_val)
    return x_train_pca, x_test_pca, x_valid_pca
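These vectoriser and PCA helpers are not chained anywhere below; a minimal sketch of how they could fit together, assuming the train/validation/test splits created above (xtr, xva, xte and n_comp are hypothetical names):
# sketch: vectorise, inspect cumulative explained variance, then project with PCA
xtr, xva, xte = apply_count_vect(train_x, valid_x, test_x)
_, _, cum_var = apply_pca(xtr)              # also plots the cumulative variance curve
n_comp = int(np.argmax(cum_var >= 95)) + 1  # smallest k reaching ~95% variance
x_train_pca, x_test_pca, x_valid_pca = fit_pca(n_comp, xtr, xte, xva)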
def fit_tokenizer(xtrain, xvalid, xtest):
    token = Tokenizer()
    token.fit_on_texts(xtrain)
    word_index = token.word_index
    # convert text to sequences of tokens and pad them to ensure equal-length vectors
    train_seq = sequence.pad_sequences(token.texts_to_sequences(xtrain), maxlen=300)
    valid_seq = sequence.pad_sequences(token.texts_to_sequences(xvalid), maxlen=300)
    test_seq = sequence.pad_sequences(token.texts_to_sequences(xtest), maxlen=300)
    return train_seq, valid_seq, test_seq
%%time
xtrain_count,xvalid_count,xtest_count = apply_count_vect(train_x,valid_x,test_x)
xtrain_count_org,xvalid_count_org,xtest_count_org = apply_count_vect(train_x_org,valid_x_org,test_x_org)
tfidf_vect_train_count,tfidf_vect_val_count,tfidf_vect_test_count = apply_tf_idf_vect(train_x,valid_x,test_x)
Word Embedding
num_words = 300 + 1
embedding_size = 50
# create a tokenizer
token = Tokenizer()
token.fit_on_texts(train_x)
word_index = token.word_index
# convert text to sequence of tokens and pad them to ensure equal length vectors
train_seq_x = sequence.pad_sequences(token.texts_to_sequences(train_x), maxlen=300)
valid_seq_x = sequence.pad_sequences(token.texts_to_sequences(valid_x), maxlen=300)
test_seq_x = sequence.pad_sequences(token.texts_to_sequences(test_x), maxlen=300)
def tn(y_true, y_pred): return confusion_matrix(y_true, y_pred)[0, 0]
def fp(y_true, y_pred): return confusion_matrix(y_true, y_pred)[0, 1]
def fn(y_true, y_pred): return confusion_matrix(y_true, y_pred)[1, 0]
def tp(y_true, y_pred): return confusion_matrix(y_true, y_pred)[1, 1]
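These four helpers index a 2x2 confusion matrix and therefore only make sense for binary problems; for this multi-class task, per-class counts have to be read off the full matrix. A sketch of a multi-class-safe variant (per_class_counts is a hypothetical name, not used below):
def per_class_counts(y_true, y_pred, cls):
    # TP/FP/FN/TN for one class of a multi-class problem, read off the full confusion matrix
    cm = confusion_matrix(y_true, y_pred)
    labels_sorted = sorted(set(y_true) | set(y_pred))  # matches sklearn's default label order
    i = labels_sorted.index(cls)
    tp_ = cm[i, i]
    fp_ = cm[:, i].sum() - tp_
    fn_ = cm[i, :].sum() - tp_
    tn_ = cm.sum() - tp_ - fp_ - fn_
    return tp_, fp_, fn_, tn_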
import pickle
# Dict of metrics to use in the model selection
score_metrics = {'accuracy': accuracy_score,
                 # 'balanced_accuracy': balanced_accuracy_score,
                 'precision_score': precision_score,
                 'recall_score': recall_score,
                 'f1-score': f1_score,
                 # 'tp': tp, 'tn': tn, 'fp': fp, 'fn': fn,
                 # 'cohens_kappa': cohen_kappa_score,
                 # 'matthews_corrcoef': matthews_corrcoef,
                 # 'roc_auc': roc_auc_score
                 }
from sklearn.metrics import average_precision_score
def report(clf, x, y, name='classifier', cv=5, dict_scoring=None, fit_params=None):
    # weighted averages are used because the classes are imbalanced;
    # dict_scoring is accepted for API compatibility but these scorers are used
    score = {'accuracy': make_scorer(accuracy_score),
             'f1-score': make_scorer(sklearn.metrics.f1_score, average='weighted'),
             'precision_score': make_scorer(sklearn.metrics.precision_score, average='weighted'),
             'recall_score': make_scorer(sklearn.metrics.recall_score, average='weighted')
             }
    scores = model_selection.cross_validate(clf, x, y.values, scoring=score, cv=cv,
                                            fit_params=fit_params, return_train_score=True, verbose=0)
    # refit on the full training data and pickle the fitted model
    fitted_model = clf.fit(x, y)
    pickle.dump(fitted_model, open(name + "_model", 'wb'))
    index = []
    value = []
    index.append("Model")
    value.append(name)
    for i in scores:
        if i == "estimator":
            continue
        for j in enumerate(scores[i]):
            index.append(i + "_cv" + str(j[0] + 1))
            value.append(j[1])
        index.append(i + "_mean")
        value.append(np.mean(scores[i]))
        index.append(i + "_std")
        value.append(np.std(scores[i]))
    return pd.DataFrame(data=value, index=index).T
df_results = df_results.append(report(naive_bayes.MultinomialNB(), xtrain_count,train_y, name='NB_Count_Vectors', cv=5, dict_scoring=score_metrics))
df_results = df_results.append(report(naive_bayes.MultinomialNB(), tfidf_vect_train_count,train_y, name='NB_TF-IDF', cv=5, dict_scoring=score_metrics))
df_results = df_results.append(report(naive_bayes.MultinomialNB(), train_seq_x,train_y, name='NB_seq', cv=5, dict_scoring=score_metrics))
df_results = df_results.append(report(naive_bayes.MultinomialNB(), xtrain_count_org,train_y_org, name='NB_Count_Vectors_org', cv=5, dict_scoring=score_metrics))
Logistic Regression
df_results = df_results.append(report(linear_model.LogisticRegression(max_iter=1000), xtrain_count,train_y, name='LR_Count_Vector', cv=5, dict_scoring=score_metrics))
df_results = df_results.append(report(linear_model.LogisticRegression(max_iter=1000), tfidf_vect_train_count,train_y, name='LR_TF-IDF', cv=5, dict_scoring=score_metrics))
df_results = df_results.append(report(linear_model.LogisticRegression(max_iter=1000), train_seq_x,train_y, name='LR_seq', cv=5, dict_scoring=score_metrics))
df_results = df_results.append(report(linear_model.LogisticRegression(max_iter=1000), xtrain_count_org,train_y_org, name='LR_count_vector_org', cv=5, dict_scoring=score_metrics))
XGBoost
fit_params = {'early_stopping_rounds': 5, 'eval_set': [(xvalid_count, valid_y)], 'eval_metric': 'mlogloss'}
df_results = df_results.append(report(XGBClassifier(n_estimators=100, subsample=0.8), xtrain_count, train_y, name='XGB_Count_Vectors', cv=3, fit_params=fit_params, dict_scoring=score_metrics))
fit_params = {'early_stopping_rounds': 5, 'eval_set': [(tfidf_vect_val_count, valid_y)], 'eval_metric': 'mlogloss'}
df_results = df_results.append(report(XGBClassifier(n_estimators=100, subsample=0.8), tfidf_vect_train_count, train_y, name='XGB_TF_IDF', cv=3, fit_params=fit_params, dict_scoring=score_metrics))
fit_params = {'early_stopping_rounds': 10, 'eval_set': [(valid_seq_x, valid_y)], 'eval_metric': 'mlogloss'}
df_results = df_results.append(report(XGBClassifier(n_estimators=100, subsample=0.8), train_seq_x, train_y, name='XGB_seq', cv=3, fit_params=fit_params, dict_scoring=score_metrics))
df_results
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
tokenizer = Tokenizer(num_words=num_words)
tokenizer.fit_on_texts(train_x)
encodings_train = tokenizer.texts_to_sequences(train_x)
encodings_test = tokenizer.texts_to_sequences(test_x)
encodings_val = tokenizer.texts_to_sequences(valid_x)
EMBEDDING_FILE = 'glove.6B.50d.txt'
embeddings = {}
for o in open(EMBEDDING_FILE, encoding='utf8'):
    word = o.split(' ')[0]
    embd = o.split(' ')[1:]
    embd = np.asarray(embd, dtype='float32')
    embeddings[word] = embd
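The vectors loaded above are 50-dimensional (matching embedding_size), while the matrix built in the next cell uses the 300-dimensional fastText model instead; a sketch of the GloVe-based alternative (glove_matrix is a hypothetical name) would be:
# sketch: 50-d embedding matrix from the GloVe dict loaded above
glove_matrix = np.zeros((len(word_index) + 1, embedding_size))
for word, i in word_index.items():
    vec = embeddings.get(word)
    if vec is not None:
        glove_matrix[i] = vec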
%%time
# load the pre-trained fastText model first; it is used to build the matrix below
pretrained = fasttext.load_model('Fasttext.h5')
# create token-embedding mapping
embedding_matrix = np.zeros((len(word_index) + 1, 300))
words = []
for word, i in tqdm(word_index.items()):
    embedding_vector = pretrained.get_word_vector(word)  # or embeddings.get(word) for GloVe
    words.append(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector
Bidirectional LSTM
es = tf.keras.callbacks.EarlyStopping(monitor='loss', mode='auto', patience=3)
check_p = tf.keras.callbacks.ModelCheckpoint("save_models/model.h5", save_best_only=True)
def cross_validate_NN(model, X, y, X_test, y_test, callbacks, name="NN", fit_params=None, scoring=None, n_splits=5):
    # ---- Parameters initialisation
    seed = 42
    k = 1
    np.random.seed(seed)
    kfold = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=seed)
    # Creation of a list for each metric
    if scoring is None:
        dic_score = {}
    else:
        dic_score = scoring.copy()
    dic_score["fit_time"] = None
    dic_score["score_time"] = None
    scorer = {}
    for i in dic_score.keys():
        scorer[i] = []
    index = ["Model"]
    results = [name]
    # ---- Loop over the k folds for cross-validation
    # NB: the same compiled model object is reused across folds, so weights carry
    # over from one fold to the next; fold scores are therefore optimistic.
    for train, test in kfold.split(X, y):
        fit_start = time.time()
        _model = model
        _model.fit(X[train], y[train],
                   epochs=10, callbacks=[callbacks],
                   validation_split=0.2, verbose=False)
        fit_end = time.time() - fit_start
        _acc = _model.evaluate(X_test, y_test, verbose=0)
        score_start = time.time()
        y_pred = np.argmax(model.predict(X_test), axis=1)
        score_end = time.time() - score_start
        # ---- save each metric
        for i in dic_score.keys():
            if i == "fit_time":
                scorer[i].append(fit_end)
                index.append(i + '_cv' + str(k))
                results.append(fit_end)
                continue
            if i == "score_time":
                scorer[i].append(score_end)
                index.append(i + '_cv' + str(k))
                results.append(score_end)
                continue
            if i == "accuracy":
                scorer[i].append(dic_score[i](y_test, y_pred))
                index.append("test_" + i + '_cv' + str(k))
                results.append(scorer[i][-1])
                continue
            scorer[i].append(dic_score[i](y_test, y_pred, average='weighted'))
            index.append("test_" + i + '_cv' + str(k))
            results.append(scorer[i][-1])
        k += 1
    # Compute mean and std for each metric
    for i in scorer:
        results.append(np.mean(scorer[i]))
        results.append(np.std(scorer[i]))
        if i in ("fit_time", "score_time"):
            index.append(i + "_mean")
            index.append(i + "_std")
            continue
        index.append("test_" + i + "_mean")
        index.append("test_" + i + "_std")
    model.save(name + 'model.h5')
    return pd.DataFrame(results, index=index).T
from tensorflow import keras
labels = set(df_model[LABEL].to_list())
# NB: sparse categorical cross-entropy expects class ids in [0, n_classes);
# the raw group ids are not contiguous, so re-indexing them (e.g. LabelEncoder) is advisable.
def create_rnn_model(word_index, label=labels, embedding_matrix=embedding_matrix, pre_trained=False):
    if pre_trained == False:
        embedded = keras.layers.Embedding(len(word_index) + 1, 100)
    else:
        embedded = keras.layers.Embedding(len(word_index) + 1, 300, weights=[embedding_matrix], trainable=False)
    model = keras.Sequential([
        embedded,
        keras.layers.SimpleRNN(40, return_sequences=True),
        keras.layers.SimpleRNN(40, return_sequences=True),
        keras.layers.SimpleRNN(40, return_sequences=True),
        keras.layers.SimpleRNN(40),
        keras.layers.Dense(1 if len(label) <= 2 else len(label),
                           activation='sigmoid' if len(label) <= 2 else 'softmax')])
    # the outputs already pass through sigmoid/softmax, so from_logits must be False
    if len(label) == 2:
        model.compile(optimizer=tf.keras.optimizers.RMSprop(learning_rate=1e-4),
                      loss=tf.losses.BinaryCrossentropy(from_logits=False),
                      metrics=['accuracy'])
    else:
        model.compile(optimizer='adam',
                      loss=tf.losses.SparseCategoricalCrossentropy(from_logits=False),
                      metrics=['accuracy'])
    return model
df_results = df_results.append(cross_validate_NN(create_rnn_model(word_index, pre_trained = True), train_seq_x, train_y.values,valid_seq_x, valid_y.values, es, name="RNN_WE",scoring=score_metrics, n_splits=3))
def create_lstm_model(word_index, label=labels, embedding_matrix=embedding_matrix, pre_trained=False):
    if pre_trained == False:
        embedded = keras.layers.Embedding(len(word_index) + 1, 100)
    else:
        embedded = keras.layers.Embedding(len(word_index) + 1, 300, weights=[embedding_matrix], trainable=False)
    model = keras.Sequential([
        embedded,
        keras.layers.LSTM(32),
        keras.layers.Dropout(0.2),
        keras.layers.Dense(1 if len(label) <= 2 else len(label),
                           activation='sigmoid' if len(label) <= 2 else 'softmax')])
    # the outputs already pass through sigmoid/softmax, so from_logits must be False
    if len(label) == 2:
        model.compile(optimizer='adam',
                      loss=tf.losses.BinaryCrossentropy(from_logits=False),
                      metrics=['accuracy'])
    else:
        model.compile(optimizer='adam',
                      loss=tf.losses.SparseCategoricalCrossentropy(from_logits=False),
                      metrics=['accuracy'])
    return model
# NB: tf-idf features are dense floats while the Embedding layer expects integer token ids;
# the sequence inputs (train_seq_x) are the more appropriate input here.
df_results = df_results.append(cross_validate_NN(create_lstm_model(word_index, pre_trained=True), tfidf_vect_train_count, train_y.values, tfidf_vect_val_count, valid_y.values, es, name="LSTM_c", scoring=score_metrics, n_splits=4))
df_results[[ "Model","test_accuracy_mean","train_accuracy_mean","test_f1-score_mean", "test_f1-score_std","test_recall_score_mean","test_precision_score_mean" ]][df_results["test_f1-score_mean"]<1].sort_values(by=["test_f1-score_mean"], ascending=False)
def create_cnn_gru_model(word_index, label=labels, embedding_matrix=embedding_matrix, pre_trained=False):
    if pre_trained == False:
        embedded = keras.layers.Embedding(len(word_index) + 1, 100)
    else:
        embedded = keras.layers.Embedding(len(word_index) + 1, 300, weights=[embedding_matrix], trainable=False)
    model = keras.Sequential([
        embedded,
        keras.layers.Conv1D(128, 5, activation='relu'),
        keras.layers.Dropout(0.2),
        keras.layers.MaxPooling1D(pool_size=4),
        keras.layers.GRU(32),
        keras.layers.Dense(1 if len(label) <= 2 else len(label),
                           activation='sigmoid' if len(label) <= 2 else 'softmax')])
    # the outputs already pass through sigmoid/softmax, so from_logits must be False
    if len(label) == 2:
        model.compile(optimizer='adam',
                      loss=tf.losses.BinaryCrossentropy(from_logits=False),
                      metrics=['accuracy'])
    else:
        model.compile(optimizer='adam',
                      loss=tf.losses.SparseCategoricalCrossentropy(from_logits=False),
                      metrics=['accuracy'])
    return model
# same caveat as above: this run feeds tf-idf features to an Embedding layer
df_results = df_results.append(cross_validate_NN(create_cnn_gru_model(word_index, pre_trained=False), tfidf_vect_train_count, train_y.values, tfidf_vect_val_count, valid_y.values, es, name="CNN_GRU_WE", scoring=score_metrics, n_splits=2))
def create_gru_model(word_index, label=labels, embedding_matrix=embedding_matrix, pre_trained=False):
    if pre_trained == False:
        embedded = keras.layers.Embedding(len(word_index) + 1, 100)
    else:
        embedded = keras.layers.Embedding(len(word_index) + 1, 300, weights=[embedding_matrix], trainable=False)
    model = keras.Sequential([
        embedded,
        keras.layers.GRU(32),
        keras.layers.Dropout(0.2),
        keras.layers.Dense(1 if len(label) <= 2 else len(label),
                           activation='sigmoid' if len(label) <= 2 else 'softmax')])
    # the outputs already pass through sigmoid/softmax, so from_logits must be False
    if len(label) == 2:
        model.compile(optimizer='adam',
                      loss=tf.losses.BinaryCrossentropy(from_logits=False),
                      metrics=['accuracy'])
    else:
        model.compile(optimizer='adam',
                      loss=tf.losses.SparseCategoricalCrossentropy(from_logits=False),
                      metrics=['accuracy'])
    return model
df_results = df_results.append(cross_validate_NN(create_gru_model(word_index, pre_trained=False), train_seq_x, train_y.values, valid_seq_x, valid_y.values, es, name="GRU_WE", scoring=score_metrics, n_splits=2))
df_results[[ "Model","test_accuracy_mean","train_accuracy_mean","test_f1-score_mean", "test_f1-score_std","test_recall_score_mean","test_precision_score_mean" ]][df_results["test_f1-score_mean"]<1].sort_values(by=["test_f1-score_mean"], ascending=False)
keras.models.load_model('GRU_WEmodel.h5').summary()
keras.models.load_model('RNN_seqmodel.h5').summary()
keras.models.load_model('LSTM_cmodel.h5').summary()
def create_bidirec_lstm_model(word_index, label=labels, embedding_matrix=embedding_matrix, pre_trained=False):
    if pre_trained == False:
        embedded = keras.layers.Embedding(len(word_index) + 1, 100)
    else:
        embedded = keras.layers.Embedding(len(word_index) + 1, 300, weights=[embedding_matrix], trainable=False)
    model = keras.Sequential([
        embedded,
        keras.layers.Bidirectional(keras.layers.LSTM(32)),
        keras.layers.Dropout(0.2),
        keras.layers.Dense(1 if len(label) <= 2 else len(label),
                           activation='sigmoid' if len(label) <= 2 else 'softmax')])
    # the outputs already pass through sigmoid/softmax, so from_logits must be False
    if len(label) == 2:
        model.compile(optimizer='adam',
                      loss=tf.losses.BinaryCrossentropy(from_logits=False),
                      metrics=['accuracy'])
    else:
        model.compile(optimizer='adam',
                      loss=tf.losses.SparseCategoricalCrossentropy(from_logits=False),
                      metrics=['accuracy'])
    return model
df_results = df_results.append(cross_validate_NN(create_bidirec_lstm_model(word_index, pre_trained=True), train_seq_x, train_y.values, valid_seq_x, valid_y.values, es, name="BiLSTM_WE", scoring=score_metrics, n_splits=5))
labels = GrpToBeMaintained
df_results = df_results.append(cross_validate_NN(create_rnn_model(word_index, pre_trained = True), train_seq_x, train_y.values,valid_seq_x, valid_y.values, es, name="RNN_seq",scoring=score_metrics, n_splits=3))
def create_bidirec_gru_model(word_index, label=labels, embedding_matrix=embedding_matrix, pre_trained=False):
    if pre_trained == False:
        embedded = keras.layers.Embedding(len(word_index) + 1, 100)
    else:
        embedded = keras.layers.Embedding(len(word_index) + 1, 300, weights=[embedding_matrix], trainable=False)
    model = keras.Sequential([
        embedded,
        keras.layers.Bidirectional(keras.layers.GRU(32)),
        keras.layers.Dropout(0.2),
        keras.layers.Dense(1 if len(label) <= 2 else len(label),
                           activation='sigmoid' if len(label) <= 2 else 'softmax')])
    # the outputs already pass through sigmoid/softmax, so from_logits must be False
    if len(label) == 2:
        model.compile(optimizer='adam',
                      loss=tf.losses.BinaryCrossentropy(from_logits=False),
                      metrics=['accuracy'])
    else:
        model.compile(optimizer='adam',
                      loss=tf.losses.SparseCategoricalCrossentropy(from_logits=False),
                      metrics=['accuracy'])
    return model
df_results = df_results.append(cross_validate_NN(create_bidirec_gru_model(word_index, pre_trained=True), train_seq_x, train_y.values, valid_seq_x, valid_y.values, es, name="BiGRU_WE", scoring=score_metrics, n_splits=5))
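Finally, a minimal inference sketch for routing a new ticket, assuming the LR_Count_Vector_model file pickled by report() above; re-fitting the CountVectorizer here mirrors apply_count_vect(), though in practice the fitted vectoriser should be pickled alongside the model:
# sketch: route a hypothetical new ticket with the pickled logistic-regression model
cv = CountVectorizer(analyzer='word', token_pattern=r'\w{1,}', max_features=300)
cv.fit(train_x)  # same settings and training data as apply_count_vect()
clf = pickle.load(open('LR_Count_Vector_model', 'rb'))
new_ticket = ["outlook password reset request"]  # hypothetical ticket text
print("Predicted Assignment group:", clf.predict(cv.transform(new_ticket).toarray())[0])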